home *** CD-ROM | disk | FTP | other *** search
- /* Copyright (c) 1994 Burra Gopal, Udi Manber. All Rights Reserved. */
-
- /*
- * string.c: String table manipulation routines. Can be used to compute
- * the dictionary as well as uncompress files.
- */
-
- #include "defs.h"
-
- extern int MAX_WORDS;
- extern int RESERVED_CHARS;
-
- int next_free_strtable = 0; /* offset of the first unused byte in free_strtable; advanced by build_string after each stored word */
- char *free_strtable = NULL; /* shared word arena used when usemalloc is 0; lazily malloc'ed by build_string as AVG_WORD_LEN * DEF_MAX_WORDS bytes */
-
- extern int usemalloc;
-
- /* debugging only */
- int
- dump_string(string_table, string_file, index_file)
- char **string_table;
- unsigned char *string_file, *index_file;
- {
- FILE *stringfp;
- FILE *indexfp;
- int i;
-
- if ((stringfp = fopen(string_file, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", string_file);
- return 0;
- }
- if ((indexfp = fopen(index_file, "r")) == NULL) {
- fprintf(stderr, "cannot open for reading: %s\n", index_file);
- fclose(stringfp);
- return 0;
- }
-
- for(i=0; i<MAX_WORDS; i++) fprintf(stringfp, "%s\n", string_table[i]);
-
- fflush(stringfp);
- fclose(stringfp);
- fclose(indexfp);
- return 1;
- }
-
- /*
- * VERY particular to the format of the string-table file: which is a series
- * of words separated by newlines -- this does an fscanf+strlen in one scan.
- */
- int
- mystringread(fp, str)
- FILE *fp;
- char *str;
- {
- int numread = 0;
- int c;
-
- while((numread <= MAX_WORD_LEN) && ((c = getc(fp)) != EOF)) {
- if (c == '\n') {
- if (numread==0) break; /* first char '\n' => in padded area */
- c = '\0';
- str[numread++] = c;
- return numread;
- }
- else str[numread++] = c;
- }
- str[numread] = '\0';
- if (c == EOF) return -1;
- return numread;
- }
-
- int
- build_string(string_table, stringfp, bytestoread, initialwordindex)
- char *string_table[DEF_MAX_WORDS]; /*[MAX_WORD_LEN+2]; */
- FILE *stringfp;
- int bytestoread;
- int initialwordindex;
- {
- int wordindex = initialwordindex;
- int numread = 0;
- int ret;
- char dummybuf[MAX_WORD_BUF];
- char *word;
-
- if (bytestoread == -1) { /* read until end of file */
- while (wordindex < MAX_WORDS) {
- if (usemalloc) word = dummybuf;
- else {
- if (free_strtable == NULL) free_strtable = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);
- if (free_strtable == NULL) break;
- word = &free_strtable[next_free_strtable];
- }
- if ((ret = mystringread(stringfp, word)) == 0) continue;
- if (ret == -1) break;
- if (usemalloc) {
- if ((word = (char *)malloc(ret + 2)) == NULL) break;
- strcpy(word, dummybuf);
- }
- else next_free_strtable += ret + 2;
- string_table[wordindex] = word;
- #if 0
- printf("word=%s index=%d\n", string_table[wordindex], wordindex);
- #endif /*0*/
- wordindex ++;
- }
- }
- else { /* read only the specified number of bytes */
- while((wordindex < MAX_WORDS) && (bytestoread > numread)) {
- if (usemalloc) word = dummybuf;
- else {
- if (free_strtable == NULL) free_strtable = (char *)malloc(AVG_WORD_LEN * DEF_MAX_WORDS);
- if (free_strtable == NULL) break;
- word = &free_strtable[next_free_strtable];
- }
- if ((ret = mystringread(stringfp, word)) <= 0) break; /* quit if EOF OR if padded area */
- if (usemalloc) {
- if ((word = (char *)malloc(ret + 2)) == NULL) break;
- strcpy(word, dummybuf);
- }
- else next_free_strtable += ret + 2;
- string_table[wordindex] = word;
- #if 0
- printf("word=%s index=%d\n", string_table[wordindex], wordindex);
- #endif /*0*/
- wordindex ++;
- numread += ret;
- }
- }
- return wordindex;
- }
-
- /*
- * Interprets srcbuf as a set of srclen/2 short integers. It looks for all the
- * short-integers encoding words in the matched line and loads only those blocks
- * of the string table. Note: srcbuf must be aligned on a short-int boundary.
- */
- int
- build_partial_string(string_table, stringfp, srcbuf, srclen, linebuf, linelen, blocksize, loaded_string_table)
- char *string_table[DEF_MAX_WORDS]; /* [MAX_WORD_LEN+2]; */
- FILE *stringfp;
- unsigned char *srcbuf;
- int srclen;
- unsigned char *linebuf;
- int linelen;
- int blocksize;
- char loaded_string_table[STRING_FILE_BLOCKS];
- {
- unsigned char *srcpos;
- int blockindex = 0;
- unsigned short srcinit, srcend;
- unsigned short wordnums[MAX_NAME_LEN]; /* maximum pattern length */
- int numwordnums = 0;
- int i;
-
- /*
- * Find all the relevant wordnums in the line.
- */
- i = 0;
- while(i<linelen) {
- if (linebuf[i] < RESERVED_CHARS) {
- if (linebuf[i] == BEGIN_VERBATIM) {
- if (isascii(linebuf[i+1])) {
- while ((linebuf[i] != END_VERBATIM) && (i <linelen)) i ++;
- }
- else i ++; /* skip over the BEGIN_VERBATIM of non-ascii character */
- i ++; /* skip over the non-ascii character OR END_VERBATIM: let it overshoot linelen...its ok */
- }
- else i ++; /* skip over the character encoding a special word OR a special character */
- }
- else {
- wordnums[numwordnums] = (unsigned char)linebuf[i]; /* always big-endian compression */
- wordnums[numwordnums] <<= 8;
- wordnums[numwordnums] |= (unsigned char)linebuf[i+1];
- wordnums[numwordnums] = decode_index(wordnums[numwordnums]); /* roundabout to avoid buserr */
- numwordnums ++;
- i += sizeof(short);
- }
- }
-
- #if 0
- for (i=0; i<numwordnums; i++) printf("num%d=%d\n", i, wordnums[i]);
- getchar();
- #endif /*0*/
-
- srcpos = srcbuf;
- srcend = *((unsigned short *)srcpos);
- srcpos += sizeof(short);
- while (srcpos < srcbuf + srclen) {
- srcinit = srcend;
- srcend = *((unsigned short *)srcpos);
- srcpos += sizeof(short);
- #if 0
- printf("%d -- %d\n", srcinit, srcend);
- #endif /*0*/
- for (i=0; i<numwordnums; i++)
- if ((wordnums[i] >= srcinit) && (wordnums[i] <= srcend)) goto include_page;
-
- blockindex++;
- continue;
-
- include_page: /* Include it if any of the word-indices fit within this range */
- if (loaded_string_table[blockindex++]) continue;
- #if 0
- printf("build_partial_string: hashing words in page# %d\n", blockindex);
- #endif /*0*/
- loaded_string_table[blockindex - 1] = 1;
- fseek(stringfp, (blockindex-1)*blocksize, 0);
- build_string(string_table, stringfp, blocksize, srcinit);
- }
- return 0;
- }
-
- pad_string_file(filename, FILEBLOCKSIZE)
- unsigned char *filename;
- int FILEBLOCKSIZE;
- {
- FILE *outfp, *infp, *indexfp;
- int offset = 0, len;
- unsigned char buf[MAX_NAME_LEN];
- int pid = getpid();
- int i;
- unsigned short wordindex = 0;
-
- if ((infp = fopen(filename, "r")) == NULL) {
- fprintf(stderr, "cannot open for reading: %s\n", filename);
- exit(2);
- }
- sprintf(buf, "%s.index", filename);
- if ((indexfp = fopen(buf, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", buf);
- fclose(infp);
- exit(2);
- }
- sprintf(buf, "%s.%d", filename, pid);
- if ((outfp = fopen(buf, "w")) == NULL) {
- fprintf(stderr, "cannot open for writing: %s\n", buf);
- fclose(infp);
- fclose(indexfp);
- exit(2);
- }
- if ((FILEBLOCKSIZE % MIN_BLOCKSIZE) != 0) {
- fprintf(stderr, "invalid block size %d: changing to %d\n", FILEBLOCKSIZE, MIN_BLOCKSIZE);
- FILEBLOCKSIZE = MIN_BLOCKSIZE;
- }
- fprintf(indexfp, "%d\n", FILEBLOCKSIZE);
-
- buf[0] = '\0';
- if ((char *)buf != fgets(buf, MAX_NAME_LEN, infp)) goto end_of_input;
- len = strlen((char *)buf);
- fputs(buf, outfp);
- fprintf(indexfp, "%d\n", wordindex);
- offset += len;
- wordindex ++;
-
- while(fgets(buf, MAX_NAME_LEN, infp) == (char *)buf) {
- len = strlen((char *)buf);
- if (offset + len > FILEBLOCKSIZE) {
- for (i=0; i<FILEBLOCKSIZE-offset; i++) /* fill up with so many newlines until the next block size */
- putc('\n', outfp);
- fputs(buf, outfp);
- fprintf(indexfp, "%d\n", wordindex);
- offset = 0;
- }
- else fputs(buf, outfp);
- offset += len;
- wordindex ++;
- }
- fprintf(indexfp, "%d\n", wordindex);
-
- end_of_input:
- fclose(infp);
- fflush(outfp);
- fclose(outfp);
- fflush(indexfp);
- fclose(indexfp);
- sprintf(buf, "mv %s.%d %s\n", filename, pid, filename);
- system(buf);
- return 0;
- }
-